If you use this code in your work or research, we kindly request that you cite our publication:
Xiaofan Lu, et al. (2025). FigureYa: A Standardized Visualization Framework for Enhancing Biomedical Data Interpretation and Research Efficiency. iMetaMed. https://doi.org/10.1002/imm3.70005
单细胞CNV的计算和画图。 Calculate and plot single-cell CNVs.
Figure 1. Characterizing Intra-tumoral Expression Heterogeneity in HNSCC by Single-Cell RNA-Seq. (B) Heatmap shows large-scale CNVs for individual cells (rows) from a representative tumor (MEEI5), inferred based on the average expression of 100 genes surrounding each chromosomal position (columns). Red: amplifications; blue: deletions.
出自https://www.cell.com/cell/fulltext/S0092-8674(17)31270-9 From https://www.cell.com/cell/fulltext/S0092-8674(17)31270-9
其他文章里类似的图: Similar pictures from other articles:
Fig. 1. Dissection of melanoma with single-cell RNA-seq. (B) Chromosomal landscape of inferred large-scale CNVs allows us to distinguish malignant from nonmalignant cells. The Mel80 tumor is shown with individual cells (y axis) and chromosomal regions (x axis). Amplifications (red) or deletions (blue) were inferred by averaging expression over 100-gene stretches on the respective chromosomes.
用单细胞RNA-seq数据计算CNV,对比展示多组之间的差异。 Calculate CNVs using single-cell RNA-seq data and compare differences between multiple groups.
source("install_dependencies.R")
## Starting R package installation...
## ===========================================
##
## Installing CRAN packages...
## Installing CRAN package: Dendritic
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'Dendritic' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: Dendritic
## Installing CRAN package: Endothelial
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'Endothelial' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: Endothelial
## Installing CRAN package: Fibroblast
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'Fibroblast' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: Fibroblast
## Installing CRAN package: Macrophage
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'Macrophage' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: Macrophage
## Installing CRAN package: Mast
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'Mast' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: Mast
## Package already installed: caTools
## Package already installed: dplyr
## Package already installed: magrittr
## Installing CRAN package: myocyte
## 将程序包安装入'C:/Users/12063/AppData/Local/R/win-library/4.2'
## (因为'lib'没有被指定)
## Warning: package 'myocyte' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Successfully installed: myocyte
## Package already installed: pheatmap
##
## Installing Bioconductor packages...
## Package already installed: org.Hs.eg.db
##
## ===========================================
## Package installation completed!
## You can now run your R scripts in this directory.
library(org.Hs.eg.db)
## 载入需要的程辑包:AnnotationDbi
## 载入需要的程辑包:stats4
## 载入需要的程辑包:BiocGenerics
##
## 载入程辑包:'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## 载入需要的程辑包:Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## 载入需要的程辑包:IRanges
## 载入需要的程辑包:S4Vectors
##
## 载入程辑包:'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## 载入程辑包:'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
##
library(magrittr)
library(dplyr)
## Warning: 程辑包'dplyr'是用R版本4.2.3 来建造的
##
## 载入程辑包:'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:dbplyr':
##
## ident, sql
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
##
## 载入程辑包:'caTools'
## The following object is masked from 'package:IRanges':
##
## runmean
## The following object is masked from 'package:S4Vectors':
##
## runmean
library(pheatmap)
Sys.setenv(LANGUAGE = "en") #显示英文报错信息 # Display English error messages
options(stringsAsFactors = FALSE) #禁止chr转成factor # Disable conversion of chr to factor
自定义函数 Custom Function
minmax <- function(x, min, max ){
x[x > max] <- max
x[x < min] <- min
return(x)
}
GSE103322_HNSCC_all_data.txt.gz,这里以原文的数据为例,其他单细胞RNA-seq数据与此类似,包含表达矩阵和样品分组等信息meta data。
GSE103322_HNSCC_all_data.txt.gz. This example uses the original data. Other single-cell RNA-seq datasets are similar and include metadata such as expression matrix and sample grouping.
Download GSE103322_HNSCC_all_data.txt.gz from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103322
# read cell annotation information
annodata <- as.data.frame(t(
read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, nrows = 5)
))
annodata[1:3,1:3]
## processed by Maxima enzyme Lymph node
## HN28_P15_D06_S330_comb 1 1
## HN28_P6_G05_S173_comb 1 0
## HN26_P14_D11_S239_comb 1 1
## classified as cancer cell
## HN28_P15_D06_S330_comb 0
## HN28_P6_G05_S173_comb 0
## HN26_P14_D11_S239_comb 1
# authors did not provide tumor ID for each single cell? horriable...
tmp <- reshape2::colsplit(gsub(pattern = "HNSCC_17", replacement = "HNSCC17", x = rownames(annodata)),
"_", names = letters[1:10]) #
annodata$tumor <- gsub(pattern = "HN|HNSCC", replacement = "MEEI", tmp$a, ignore.case = T)
head(tmp)
## a b c d e f g h i j
## 1 HN28 P15 D06 S330 comb NA NA NA
## 2 HN28 P6 G05 S173 comb NA NA NA
## 3 HN26 P14 D11 S239 comb NA NA NA
## 4 HN26 P14 H05 S281 comb NA NA NA
## 5 HN26 P25 H09 S189 comb NA NA NA
## 6 HN26 P14 H06 S282 comb NA NA NA
# read normalized expression values
exprdata <- read.table(file = gzfile("GSE103322_HNSCC_all_data.txt.gz"),
sep = "\t", header = T, row.names = 1, skip = 5)
colnames(exprdata) <- rownames(annodata)
exprdata[1:3, 1:3]
## HN28_P15_D06_S330_comb HN28_P6_G05_S173_comb HN26_P14_D11_S239_comb
## C9orf152 0.0000 0.0000 0.42761
## RPS11 6.0037 7.3006 7.28850
## ELMO2 0.0000 0.0000 0.00000
# you may save variables into a Rdata file for quicker data reloading.
save(exprdata, annodata, file = "data.Rdata")
Load and preprocess data
We used the remaining cells (k = 5,902) to identify genes that are expressed at high or intermediate levels by calculating the aggregate
expression of each gene i across the k cells, as Ea(i) = log2(average(TPM(i)1.k)+1), and excluded genes with Ea < 4. For the remaining
cells and genes, we defined relative expression by centering the expression levels, Eri,j = Ei,j-average[Ei,1.k]. The relative
expression levels, across the remaining subset of cells and genes, were used for downstream analysis.
varList <- load(file = "data.Rdata")
varList
## [1] "exprdata" "annodata"
# all cells are high quality with more than 2000 genes in individual cell.
annodata$gene_number <- colSums(exprdata > 0)
# filter genes with high or intermediate expression levels
# Note: This step of filtering out lowly expressed genes is and necessary, otherwise the CNV estimation result may de distorted.
tpmdata <- 10*(2^exprdata-1)
gene_average <- apply(tpmdata, 1, function(x){ log2(mean(x)+1)})
gene_mask <- gene_average > 4
sum(gene_mask)/length(gene_mask)
## [1] 0.3113231
exprdata <- exprdata[gene_mask,]
# sort genes by genomic location
gene_loc <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(exprdata),
columns = c( "CHRLOC"),
keytype = "SYMBOL")
## Warning in .deprecatedColsMessage(): Accessing gene location information via 'CHR','CHRLOC','CHRLOCEND' is
## deprecated. Please use a range based accessor like genes(), or select()
## with columns values like TXCHROM and TXSTART on a TxDb or OrganismDb
## object instead.
## 'select()' returned 1:many mapping between keys and columns
chr_used <- c(as.character(1:22),"X")
gene_loc %<>%
dplyr::filter(CHRLOCCHR %in% chr_used) %<>% # filter out scaffold
dplyr::mutate(chr = factor(CHRLOCCHR, levels = chr_used)) %<>% # set levels for chr
dplyr::arrange(chr, abs(CHRLOC)) # sort genes by genomic location
gene_loc_uniq <- gene_loc[!duplicated(gene_loc$SYMBOL),] # gene deduplication
# set row-order of exprdata
exprdata <- exprdata[gene_loc_uniq$SYMBOL, ]
# get relative expression by centering (note: no scaling)
reladata <- sweep(exprdata, 1, rowMeans(exprdata))
# bound data into [-3, 3]
reladata <- minmax(reladata, -3, 3)
window_length <- 100
# initial CNVs
initial_cnv <- NA + reladata # genes with location information were used for downstream analyses
for (chr in chr_used) {
chr_genes = gene_loc_uniq$SYMBOL[gene_loc_uniq$chr == chr]
chr_data = reladata[chr_genes, , drop = FALSE]
if (nrow(chr_data) > 1) {
chr_data = apply(chr_data, 2, caTools::runmean, k = window_length)
initial_cnv[chr_genes, ] <- chr_data
}else{
print(chr)
}
}
which(is.na(initial_cnv)) # check inistal_cnv
## integer(0)
# re-centering data across chromosome after smoothing, see (Patel et al., 2014)
initial_cnv <- sweep(initial_cnv, 2, apply(initial_cnv, 2, median), FUN = "-")
# initial CNV score of each single-cell
initial_cnv_score <- colMeans(initial_cnv^2)
# initial CNV correlation score
cell_in_which_tumor <- annodata$tumor
initial_cnv_score_tumor_profile <- sapply(unique(cell_in_which_tumor), function(x){
rowMeans(initial_cnv[, cell_in_which_tumor == x])
})
initial_cnv_corr <- sapply(colnames(initial_cnv), function(x) {
cor(x = as.numeric(initial_cnv[, x, drop = T]), y = initial_cnv_score_tumor_profile[,annodata[x,"tumor"]])
})
initial_cnv_score_threshold <- 0.05
initial_cnv_corr_threshold <- 0.5
# putative maglignant cells
initital_putative_maglignant <- names(which(
initial_cnv_score > initial_cnv_score_threshold &
initial_cnv_corr > initial_cnv_corr_threshold
))
# putative non-maglignant cells
initital_putative_non_maglignant <- names(which(
initial_cnv_score < initial_cnv_score_threshold &
initial_cnv_corr < initial_cnv_corr_threshold
))
table(annodata[initital_putative_maglignant, "classified as cancer cell"])
##
## 0 1
## 1 1324
table(annodata[initital_putative_non_maglignant, "classified as non-cancer cells"])
##
## 0 1
## 755 3004
annodata_base <- annodata[initital_putative_non_maglignant, c("non-cancer cell type"), drop = F]
table(annodata_base$`non-cancer cell type`)
##
## -Fibroblast 0 B cell Dendritic Endothelial Fibroblast
## 10 755 124 30 244 1332
## Macrophage Mast myocyte T cell
## 96 116 19 1033
types_used <- c("B cell", "Dendritic", "Endothelial", "Fibroblast", "Macrophage", "Mast", "myocyte", "T cell")
annodata_base <- annodata_base[annodata_base$`non-cancer cell type` %in% types_used,,drop = F]
# baseline
baseline_cnv <- as.matrix(t(apply(initial_cnv[,rownames(annodata_base)], 1, function(x) {
tapply(x, annodata_base$`non-cancer cell type`, mean)
})))
baseline_max <- matrix(rowMax(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
baseline_min <- matrix(rowMin(baseline_cnv),
nrow = nrow(initial_cnv),
ncol = ncol(initial_cnv),
dimnames = dimnames(initial_cnv))
# final CNVs
final_cnv <- 0* initial_cnv
final_cnv[initial_cnv > baseline_max + 0.2] <- (initial_cnv - baseline_max)[initial_cnv > baseline_max + 0.2]
final_cnv[initial_cnv < baseline_min - 0.2] <- (initial_cnv - baseline_min)[initial_cnv < baseline_min - 0.2]
# re-centering data across chromosome after smoothing
# final_cnv <- sweep(final_cnv, 2, apply(final_cnv, 2, median), FUN = "-")
用pheatmap画图
Use pheatmap to plot
annodata$type <- factor(ifelse(annodata$`classified as non-cancer cells` == 1,
"Non-malignant",
ifelse(annodata$`classified as cancer cell` == 1 & annodata$`Lymph node` == 0,
"Malignant (primary)",
"Malignant (LN)"
)),
levels = c("Non-malignant", "Malignant (primary)","Malignant (LN)"))
phAnno <- annodata[annodata$tumor == "MEEI5",]
phAnno <- phAnno[order(phAnno$type),]
phData <- t(final_cnv[,rownames(phAnno)])
phData <- minmax(phData, min = -1, max = 1)
pheatmap(phData,
color = colorRampPalette(c("darkblue", "blue", "grey90", "red", "red4"),
interpolate = "linear")(11),
# breaks = c(seq(-1, -0.1, length.out = 50),
# seq(0.1, 1, length.out = 50)),
annotation_row = phAnno[, "type", drop = F],
gaps_col = cumsum(table(gene_loc_uniq$chr)),
cluster_rows = F, cluster_cols = F,
show_colnames = F, show_rownames = F,
filename = "scCNV.pdf")